# imports and plotting utility functions
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
import seaborn as sns
from matplotlib import pylab as plt
from statsmodels.discrete.discrete_model import Logit
from scipy.linalg import norm
warnings.simplefilter('ignore')
# RandomForest comparison model: out-of-bag scoring enabled, fixed seed for reproducibility;
# refit on each dataset below to contrast its importances with logistic-regression betas
rf_cmp = RandomForestClassifier(n_estimators=250, bootstrap=True, oob_score=True, random_state=0)
def plot_lr(true_coefs, est_coefs, pvals, var_names=None, rf_cmp_coef=None):
    """Scatter-plot logistic-regression coefficients with significance stars.

    Parameters
    ----------
    true_coefs : array-like or None
        Ground-truth betas to overlay (skipped when None).
    est_coefs : array-like
        Estimated betas, one per input variable.
    pvals : array-like
        Per-coefficient p-values; coefficients with p < 0.05 get a star.
    var_names : list of str, optional
        X-axis tick labels; defaults to 1..n_feat.
    rf_cmp_coef : array-like, optional
        RandomForest feature importances to overlay for comparison.
    """
    n_feat = len(est_coefs)
    # NOTE(review): the original read the notebook globals `lr_pvalues` and `X`
    # here; use the function's own arguments so the plot matches what is passed in.
    where_sign = np.asarray(pvals) < 0.05
    plt.figure(figsize=(17, 6))
    # plot all estimated betas (significant or not)
    plt.scatter(np.arange(n_feat), est_coefs, s=150, color='red', label='estimated betas', alpha=0.5)
    if true_coefs is not None:
        plt.scatter(np.arange(n_feat), true_coefs, s=150, color='black', label='true betas', alpha=0.5)
    if rf_cmp_coef is not None:
        plt.scatter(np.arange(n_feat), rf_cmp_coef, s=150, marker='D', color='steelblue', label='RandomForest importances', alpha=0.5)
    # star the significant betas and print all p-values near the bottom
    axes = plt.gca()
    y_min, y_max = axes.get_ylim()
    # widen the y-range so the stars and p-value labels fit inside the axes
    axes.set_ylim(y_min * 1.25, y_max * 1.25)
    sign_y = np.sum(where_sign) * [y_min]
    plt.scatter(np.arange(n_feat)[where_sign], sign_y, color='red', label='significant at p<0.05', s=150, marker=(5, 1), alpha=0.75, linewidth=3)
    for i_b, p in enumerate(pvals):
        plt.text(x=i_b - 0.25, y=y_min * 1.10, s='$p$=%.3f' % p)
    plt.xlabel('input variables')
    if var_names is None:
        plt.xticks(np.arange(n_feat), (np.arange(n_feat) + 1), fontsize=15)
    else:
        plt.xticks(np.arange(n_feat), var_names, fontsize=12, rotation=90)
    plt.grid(True)
    plt.title('Logistic regression', fontsize=16)
    plt.legend(loc='upper right', fontsize=14, fancybox=True, framealpha=0.5)
def plot_regr_paths(coefs, accs, nonzeros, C_grid, var_names=None, unbiased_accs=None):
    """Two-panel plot of an L1 regularization path.

    Left panel: each coefficient's trajectory across the C grid, colored by
    the group of variables that drops to zero together. Right panel:
    out-of-sample accuracy along the same grid.

    Parameters
    ----------
    coefs : ndarray, shape (n_C, n_feat)
        Mean coefficients at each grid point.
    accs : array-like, shape (n_C,)
        Out-of-sample accuracy at each grid point.
    nonzeros : array-like, shape (n_C,)
        Number of non-zero coefficients at each grid point (used as x tick labels).
    C_grid : array-like, shape (n_C,)
        Inverse regularization strengths.
    var_names : list of str, optional
        Legend labels for the coefficient lines; defaults to 1..n_feat.
    unbiased_accs : array-like, optional
        Accuracies of the debiased (refit) model, overlaid in orange.
    """
    n_cols = 2
    n_rows = 1
    n_verticals = len(coefs)
    # NOTE(review): the original set n_feat = len(coefs) (the number of C
    # steps) and later iterated the global X; the number of variables is the
    # trailing axis of the coefficient matrix.
    n_feat = coefs.shape[-1]
    my_palette = np.array([
        '#F47D7D', '#FBEF69', '#98E466', '#000000',
        '#A7794F', '#CCCCCC', '#85359C', '#FF9300', '#FF0030', 'grey', 'blue', 'salmon', '#4BBCF6',
        'green', 'tomato', 'darkred', 'black', 'cyan', 'lime'
    ])
    # '???????' marks coefficients whose drop-out group has not been seen yet
    my_colors = np.array(['???????'] * n_feat)
    i_col = 0
    new_grp_pts_x = []
    new_grp_pts_y = []
    new_grp_pts_col = []
    new_grp_pts_total = []
    for i_vertical, (params, acc, C) in enumerate(zip(coefs, accs, C_grid)):
        b_notset = my_colors == '???????'
        # coefficients that are exactly zero at this regularization strength
        # (the original misleadingly named this `b_nonzeros`)
        b_zeroed = params == 0
        b_coefs_of_new_grp = np.logical_and(b_notset, b_zeroed)
        if np.sum(b_coefs_of_new_grp) > 0:
            i_col += 1
            # we found a new subset of coefficients that just became 0
            for new_i in np.where(b_coefs_of_new_grp)[0]:
                # color all coefficients of the current group the same way
                cur_col = my_palette[i_col]
                my_colors[new_i] = cur_col
                new_grp_pts_x.append(C)
                new_grp_pts_y.append(acc)
                new_grp_pts_col.append(cur_col)
                new_grp_pts_total.append(np.sum(b_zeroed))
    if var_names is None:
        X_colnames = np.arange(n_feat) + 1
    else:
        X_colnames = var_names
    subplot_xlabel = '#nonzero coefficients'
    f, axarr = plt.subplots(nrows=n_rows, ncols=n_cols,
                            figsize=(15, 10), facecolor='white')
    t, i_col = 0, 0
    # left panel: one line per variable across the (reversed, log-scaled) C grid
    for i_line in range(n_feat):
        axarr[i_col].plot(np.log10(C_grid)[::-1],
                          coefs[:, i_line], label=X_colnames[i_line],
                          color=my_colors[i_line], linewidth=1.5)
    axarr[i_col].set_xlabel(subplot_xlabel, fontsize=10)
    axarr[i_col].legend(loc='lower left', fontsize=11, markerscale=10, fancybox=True, framealpha=0.5)
    axarr[0].grid(True)
    axarr[0].set_title('Penalized Logistic: Groups of selected variables', fontsize=16)
    axarr[0].set_xticks(np.log10(C_grid)[::-1])
    axarr[0].set_xticklabels(nonzeros)
    # right panel: accuracy along the path, optionally with the debiased refit
    if unbiased_accs is not None:
        axarr[1].scatter(np.arange(len(unbiased_accs)), unbiased_accs, color='orange',
                         linewidth=4, label='prediction accuracy (debiased)', zorder=10)
    axarr[1].scatter(np.arange(len(accs)), accs, color='black',
                     linewidth=3, label='prediction accuracy', zorder=10)
    axarr[1].set_ylim(0, 1.05)
    axarr[1].grid(True)
    axarr[1].set_xticks(np.arange(n_verticals))
    axarr[1].set_xticklabels(nonzeros)
    axarr[1].set_xlabel(subplot_xlabel, fontsize=10)
    axarr[1].legend(loc='lower left', fontsize=14, markerscale=1, fancybox=True, framealpha=0.5)
    axarr[1].set_title('Penalized Logistic: Out-of-sample accuracy ($R^2$ score)', fontsize=16)
def corrfunc(x, y, **kws):
    """Annotate the current axes with the Pearson correlation of x and y.

    Designed to be passed to seaborn's ``PairGrid.map_lower``; extra keyword
    arguments from the grid are accepted and ignored.
    """
    from scipy import stats
    coef, _ = stats.pearsonr(x, y)
    axis = plt.gca()
    axis.annotate("r = {:.2f}".format(coef),
                  xy=(.1, .9), xycoords=axis.transAxes)
import statsmodels.api as sm
# https://github.com/statsmodels/statsmodels/issues/3931
# compatibility shim: statsmodels still calls scipy.stats.chisqprob, which
# was removed from scipy — restore it as the survival function of chi2
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
# https://datascience.stackexchange.com/questions/937/does-scikit-learn-have-forward-selection-stepwise-regression-algorithm
def fwd_stepwise_selection(X, y, initial_list=None, verbose=True):
    """Rank all candidate features by greedy forward selection.

    At each step, fit a Logit model for every not-yet-included feature and
    add the one with the smallest p-value. No stopping threshold is applied
    and no backward (removal) step is performed: the loop runs until every
    column of X is included, so the result is an *ordering* of all features.
    (The original docstring advertised `threshold_in`/`threshold_out`
    parameters and exclusions that were never implemented.)

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        verbose - whether to print the sequence of inclusions
    Returns: list of all feature names in order of inclusion
    """
    # avoid the mutable-default-argument pitfall of the original signature
    included = [] if initial_list is None else list(initial_list)
    while len(included) < X.shape[1]:
        # forward step: score each remaining feature by its p-value
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = Logit(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit(disp=0)
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # idxmin returns the feature *label*; Series.argmin returns a
        # positional index in modern pandas, which would break this lookup
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
    return included
# statistical helper functions
def compute_Logistic_regpath(X, y, C_grid):
    """Trace the L1-penalized logistic regression path over a grid of C.

    For each C, the sparse model is fit on 100 random 90/10 train/test
    splits; an (effectively) unpenalized L2 model is additionally refit on
    only the selected variables to obtain a "debiased" accuracy estimate.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_feat)
        Input matrix.
    y : ndarray, shape (n_samples,)
        Binary targets.
    C_grid : array-like
        Inverse regularization strengths to sweep.

    Returns
    -------
    coefs : ndarray, shape (len(C_grid), n_feat) -- mean coefficients
    accs : ndarray, shape (len(C_grid),) -- mean out-of-sample accuracies
    nonzeros : ndarray, shape (len(C_grid),) -- non-zero coefficient counts
    unbiased_accs : ndarray, shape (len(C_grid),) -- mean debiased accuracies
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; use the
    # model_selection equivalent locally (only the first split was ever used,
    # so n_splits=1 reproduces the original behavior)
    from sklearn.model_selection import ShuffleSplit
    coef_list2 = []
    acc_list2 = []
    acc_unbiased_list2 = []
    nonzero_list2 = []
    for i_step, my_C in enumerate(C_grid):
        sample_accs = []
        sample_accs_unbiased = []
        sample_coef = []
        for i_subsample in range(100):
            folder = ShuffleSplit(n_splits=1, test_size=0.1,
                                  random_state=i_subsample)
            train_inds, test_inds = next(folder.split(X))
            # liblinear is the solver that supports the L1 penalty (and was
            # the implicit default in old scikit-learn versions)
            clf = LogisticRegression(C=my_C, random_state=i_subsample,
                                     penalty='l1', solver='liblinear')
            clf.fit(X[train_inds, :], y[train_inds])
            # compute out-of-sample prediction accuracy
            # NOTE(review): average_precision_score is fed hard class labels
            # here, not probabilities/scores; kept as in the original
            acc = average_precision_score(
                y_true=y[test_inds],
                y_score=clf.predict(X[test_inds]),
                average='weighted')
            # get out-of-sample accuracy from unbiased linear model refit
            # on only the inputs the L1 penalty kept
            b_vars_to_keep = np.squeeze(clf.coef_) != 0
            if np.sum(b_vars_to_keep) > 0:
                # C=100000 makes the L2 penalty negligible
                unbiased_lr = LogisticRegression(C=100000, random_state=i_subsample,
                                                 penalty='l2', solver='liblinear')
                unbiased_lr.fit(
                    X[train_inds, :][:, b_vars_to_keep], y[train_inds])
                unbiased_acc = average_precision_score(
                    y_true=y[test_inds],
                    y_score=unbiased_lr.predict(X[test_inds][:, b_vars_to_keep]),
                    average='weighted')
            else:
                # the penalty zeroed out every coefficient
                unbiased_acc = 0
            sample_accs.append(acc)
            sample_accs_unbiased.append(unbiased_acc)
            sample_coef.append(np.squeeze(clf.coef_))
        # aggregate over the 100 subsamples
        mean_coefs = np.mean(np.array(sample_coef), axis=0)
        coef_list2.append(mean_coefs)
        acc_at_step = np.mean(sample_accs)
        acc_list2.append(acc_at_step)
        acc_unbiased_list2.append(np.mean(sample_accs_unbiased))
        notzero = np.count_nonzero(mean_coefs)
        print("C: %.4f acc: %.2f active_coefs: %i" % (my_C, acc_at_step, notzero))
        nonzero_list2.append(notzero)
    return np.array(coef_list2), np.array(acc_list2), np.array(nonzero_list2), np.array(acc_unbiased_list2)
Dataset summary: These data contain a binary heart-disease outcome (AHD) for 303 patients who presented with chest pain.
# Load the ISL heart dataset and assemble the design matrix
import pandas as pd
df_heart = pd.read_csv('dataset_heart_ISL.csv').fillna(value=0)
feat_names = ['Age', u'Sex', u'RestBP', u'Chol', u'Fbs',
              u'RestECG', u'MaxHR', u'ExAng', u'Oldpeak', u'Slope', u'Ca', u'Thal', u'ChestPain']
# binary outcome: AHD == 'Yes'
# np.int was removed in NumPy 1.24; the builtin int is the replacement
y = np.asarray(df_heart['AHD'] == 'Yes', dtype=int)
# z-score the numeric columns; one-hot encode the last two (Thal, ChestPain)
df_part1 = pd.DataFrame(StandardScaler().fit_transform(df_heart[feat_names[:-2]].values), columns=feat_names[:-2])
df_part2 = pd.get_dummies(df_heart[feat_names[-2:]])
X = np.hstack((df_part1.values, df_part2.values))
feat_names = list(df_part1.columns) + list(df_part2.columns)
cl2_prop = np.sum(y) * 100 / len(y)
print('Balance between class 1 : 2 is %.0f%% : %.0f%%' % (100 - cl2_prop, cl2_prop))
df_heart
# pairwise scatter matrix with per-panel correlation annotations
g = sns.pairplot(pd.DataFrame(np.hstack((X, y[:, None])), columns=feat_names + ['out']),
                 kind="reg", diag_kind="kde", hue='out')
g.map_lower(corrfunc)
# ordinary linear model with logit loss
model = Logit(y, X)
res = model.fit(disp=0)
lr_coefs = res.params
lr_pvalues = res.pvalues
# random forest importances as a non-linear point of comparison
rf_cmp.fit(X, y)
rf_cmp.feature_importances_
# compute regularization paths of L1-penalized linear model with logit loss
C_grid = np.logspace(+1, -2, 25)
coef_list, acc_list, nonzero_list, unbiased_acc_list = compute_Logistic_regpath(X, y, C_grid)
# importances are rescaled to the mean coefficient magnitude so both fit on one plot
plot_lr(None, lr_coefs, lr_pvalues, feat_names, rf_cmp_coef=rf_cmp.feature_importances_ * np.mean(np.abs(lr_coefs)))
plot_regr_paths(coef_list, acc_list, nonzero_list, C_grid, feat_names, unbiased_acc_list)
# greedy p-value-based ordering of all features
sel_w_pvals = fwd_stepwise_selection(pd.DataFrame(X, columns=feat_names), y, verbose=True)
print('Forward-stepwise selection: ' + ' -> '.join(sel_w_pvals))
res.summary(xname=feat_names)
Dataset summary (ESL): A retrospective sample of males in a heart-disease high-risk region of the Western Cape, South Africa. There are roughly two controls per case of coronary heart disease. Many of the coronary heart disease positive men have undergone blood pressure reduction treatment and other programs to reduce their risk factors after their coronary heart disease event. In some cases the measurements were made after these treatments. These data are taken from a larger dataset, described in Rousseauw et al, 1983, South African Medical Journal.
Based on these data, does having a family history of coronary heart disease affect a patient's chance of having coronary heart disease? Does this result change for patients younger than 40 years old? What about for patients aged 40 years or older?
Variables: sbp — systolic blood pressure; tobacco — cumulative tobacco (kg); ldl — low-density lipoprotein cholesterol; adiposity; famhist — family history of heart disease (Present, Absent); typea — type-A behavior; obesity; alcohol — current alcohol consumption; age — age at onset; chd — response, coronary heart disease.
import pandas as pd
# load the South African heart-disease table (ESL)
df_africa = pd.read_excel('dataset_south_african_heart_disease.xls')
df_africa
feat_names = ['sbp', u'tobacco', u'ldl', u'adiposity', u'famhist', u'typea',
u'obesity', u'alcohol', u'age']
# binarize famhist (Present/Absent); drop_first keeps a single indicator column
df_africa['famhist'] = pd.get_dummies(df_africa['famhist'], drop_first=True)
# z-score all inputs; chd is the binary outcome
X = StandardScaler().fit_transform(df_africa[feat_names].values)
y = df_africa['chd'].values
cl2_prop = np.sum(y) * 100 / len(y)
print('Balance between class 1 : 2 is %.0f%% : %.0f%%' % (100 - cl2_prop, cl2_prop))
# pairwise scatter matrix with per-panel correlation annotations
g = sns.pairplot(pd.DataFrame(np.hstack((X, y[:, None])), columns=feat_names + ['out']),
hue='out')
g.map_lower(corrfunc)
# ordinary linear model with logit loss
model = Logit(y, X)
res = model.fit(disp=0)
lr_coefs = res.params
lr_pvalues = res.pvalues
# random forest importances as a non-linear point of comparison
rf_cmp.fit(X, y)
rf_cmp.feature_importances_
# compute regularization paths of L1-penalized linear model with logit loss
C_grid = np.logspace(+.5, -2, 25)
coef_list, acc_list, nonzero_list, unbiased_acc_list = compute_Logistic_regpath(X, y, C_grid)
# importances are rescaled to the mean coefficient magnitude so both fit on one plot
plot_lr(None, lr_coefs, lr_pvalues, feat_names, rf_cmp_coef=rf_cmp.feature_importances_ * np.mean(np.abs(lr_coefs)))
plot_regr_paths(coef_list, acc_list, nonzero_list, C_grid, feat_names, unbiased_acc_list)
# greedy p-value-based ordering of all features
sel_w_pvals = fwd_stepwise_selection(pd.DataFrame(X, columns=feat_names), y, verbose=True)
print('Forward-stepwise selection: ' + ' -> '.join(sel_w_pvals))
res.summary(xname=feat_names)
Dataset summary: This dataset is from the Duke University Cardiovascular Disease Databank and consists of 3504 patients and 6 variables. The patients were referred to Duke University Medical Center for chest pain. Some interesting analyses include predicting the probability of significant (>= 75% diameter narrowing in at least one important coronary artery) coronary disease, and predicting the probability of severe coronary disease given that some significant disease is "ruled in." The first analysis would use sigdz as a response variable, and the second would use tvdlm on the subset of patients having sigdz=1. Severe coronary disease is defined as three-vessel or left main disease and is denoted by tvdlm=1. sex=0 for males, 1 for females.
3504 observations and 6 variables

import pandas as pd
# Duke coronary catheterization dataset; drop rows with any missing value
df_coro = pd.read_excel('dataset_coronary_catheder.xls').dropna(how='any')
df_coro
feat_names = ['sex', 'age', 'cad.dur', 'choleste']
feat_continuous = ['age', 'cad.dur', 'choleste']
# z-score the continuous inputs only; sex stays a 0/1 indicator
df_coro[feat_continuous] = StandardScaler().fit_transform(df_coro[feat_continuous])
#y = df_coro['tvdlm'].values
# outcome: significant coronary disease (sigdz)
y = df_coro['sigdz'].values
df_coro = df_coro[feat_names]
X = df_coro.values
cl2_prop = np.sum(y) * 100 / len(y)
print('Balance between class 1 : 2 is %.0f%% : %.0f%%' % (100 - cl2_prop, cl2_prop))
# ordinary linear model with logit loss
model = Logit(y, X)
res = model.fit(disp=0)
lr_coefs = res.params
lr_pvalues = res.pvalues
# random forest importances as a non-linear point of comparison
rf_cmp.fit(X, y)
rf_cmp.feature_importances_
# compute regularization paths of L1-penalized linear model with logit loss
C_grid = np.logspace(+.01, -3, 25)
coef_list, acc_list, nonzero_list, unbiased_acc_list = compute_Logistic_regpath(X, y, C_grid)
# importances are rescaled to the mean coefficient magnitude so both fit on one plot
plot_lr(None, lr_coefs, lr_pvalues, feat_names, rf_cmp_coef=rf_cmp.feature_importances_ * np.mean(np.abs(lr_coefs)))
plot_regr_paths(coef_list, acc_list, nonzero_list, C_grid, feat_names, unbiased_acc_list)
# greedy p-value-based ordering of all features
sel_w_pvals = fwd_stepwise_selection(pd.DataFrame(X, columns=feat_names), y, verbose=True)
print('Forward-stepwise selection: ' + ' -> '.join(sel_w_pvals))
res.summary(xname=feat_names)
Compare inference with prediction using the infpred plot from the notebook on regression problems.